Setup

Packages, Data, Maps

Load the PM data and project using CA Albers projection.

library(ggplot2)
library(leaflet)
library(proj4)
library(geoR)
library(dplyr)
library(fields)
library(mgcv)

pm25 = read.csv("/Users/mf/Dropbox (University of Southern California)/Courses/PM569/Fall 2019/data/pm25_ca_2018.csv")
pm25.sites = read.csv("/Users/mf/Dropbox (University of Southern California)/Courses/PM569/Fall 2019/data/pm25_sites.csv")
pm_ca<-merge(pm25 %>% filter(date=="11/10/18"), pm25.sites, by='site_id')

proj_albers<-"+proj=aea +lat_1=34.0 +lat_2=40.5 +lon_0=-120.0 +x_0=0 +y_0=-4000000 +ellps=GRS80 +datum=NAD83 +units=km"

newcoords<-project(as.matrix(cbind(pm_ca$longitude, pm_ca$latitude)), proj=proj_albers)
pm_ca$x<-newcoords[,1]
pm_ca$y<-newcoords[,2]

Make a map of the PM2.5 concentrations

pm.pal = colorNumeric(c('darkgreen','goldenrod','brown','brown'),
                        domain=pm_ca$pm25)
  leaflet(pm_ca) %>% 
  addProviderTiles('CartoDB.Positron') %>% 
  addCircles(lat=~latitude,lng=~longitude,label=~paste0(pm25, ' ug/m3'), color=~pm.pal(pm25),
             opacity=1, fillOpacity=1, radius=500) %>%
  addLegend('bottomleft', pal=pm.pal, values=pm_ca$pm25,
            title='PM2.5 (ug/m3)', opacity=1)

GeoData

Based on our previous results we will use the log PM2.5 concentrations and the Albers projected coordinates.

# log pm25 better, set up geodata object
pm_ca$logpm25<-log(pm_ca$pm25)
lpm_geo_alb<-as.geodata(pm_ca,coords.col=c(7,8),data.col=9)
plot(lpm_geo_alb)

summary(lpm_geo_alb)
## Number of data points: 68 
## 
## Coordinates summary
##             x         y
## min -354.9670 -599.0421
## max  424.1254  415.1952
## 
## Distance summary
##         min         max 
##    3.607555 1187.488641 
## 
## Data summary
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 1.481605 2.314549 2.886934 2.972054 3.385151 5.204556

Grid creation

Create a grid for kriging/interpolation

res=60
xs=seq(min(pm_ca$x),max(pm_ca$x),len=res)
ys=seq(min(pm_ca$y),max(pm_ca$y),len=res)
myGrid=expand.grid(xs,ys)
names(myGrid)=c('x','y')

# make sure it looks correct
plot(myGrid, pch=19, cex=0.5)

Simple and Ordinary Kriging WLS

Let’s krig using the binned semivariogram fit via WLS

  • Binned semivariogram (variog), fit theoretical by WLS (variofit)
  • Set up krige.control and then krige.conv
  • Simple Kriging has constant known/fixed mean (it must be specified)
  • Ordinary Kriging has constant mean but the mean is estimated
# binned semivariogram with C-H robust
# max dist 200 km, use 12 bins
vario2<-variog(lpm_geo_alb,uvec=seq(0,200,l=12),option="bin", estimator.type="modulus")
## variog: computing omnidirectional variogram
plot(vario2,xlab="Distance (h), km")

# fitting semivariogram by wls
# change model type to gaussian
vfit_wls2=variofit(vario2,ini.cov.pars=c(0.5,100),nugget=0.05,fix.nugget=FALSE,cov.model='spherical',weights='cressie')
## variofit: covariance model used is spherical 
## variofit: weights used: cressie 
## variofit: minimisation function used: optim
plot(vario2,xlab="Distance (h), km")
lines(vfit_wls2,col="cyan",lwd=1.5)

summary(vfit_wls2)
## $pmethod
## [1] "WLS (weighted least squares)"
## 
## $cov.model
## [1] "spherical"
## 
## $spatial.component
##     sigmasq         phi 
##   0.6196633 250.6438244 
## 
## $spatial.component.extra
## kappa 
##   0.5 
## 
## $nugget.component
##      tausq 
## 0.01921007 
## 
## $fix.nugget
## [1] FALSE
## 
## $fix.kappa
## [1] TRUE
## 
## $practicalRange
## [1] 250.6438
## 
## $sum.of.squares
##    value 
## 15.11108 
## 
## $estimated.pars
##        tausq      sigmasq          phi 
##   0.01921007   0.61966327 250.64382442 
## 
## $weights
## [1] "cressie"
## 
## $call
## variofit(vario = vario2, ini.cov.pars = c(0.5, 100), cov.model = "spherical", 
##     fix.nugget = FALSE, nugget = 0.05, weights = "cressie")
## 
## attr(,"class")
## [1] "summary.variomodel"
# Simple kriging (constant known mean must be specified)
KCsimple<-krige.control(type.krige="sk",obj.m=vfit_wls2,beta=2)

# locations= are the unobserved locations where we want to make predictions
simple_krig<-krige.conv(lpm_geo_alb,locations=myGrid,krige=KCsimple, output=output.control(signal=TRUE))
## krige.conv: model with constant mean
## krige.conv: Kriging performed using global neighbourhood
# plot gridded predictions
image.plot(xs,ys,matrix(simple_krig$predict,res,res,byrow=FALSE),col=tim.colors(32), main="Simple Kriging")

# ordinary kriging, default in krig.control

KCord<-krige.control(type.krige='ok',obj.m=vfit_wls2)
ordinary_krige<-krige.conv(lpm_geo_alb,locations=myGrid,krige=KCord,output=output.control(signal=TRUE))
## krige.conv: model with constant mean
## krige.conv: Kriging performed using global neighbourhood
# plot gridded predictions
image.plot(xs,ys,matrix(ordinary_krige$predict,res,res,byrow=FALSE),col=tim.colors(32), main="Ordinary Kriging")

# plot standard errors
image.plot(xs,ys,matrix(sqrt(ordinary_krige$krige.var),res,res,byrow=FALSE),col=tim.colors(32))

Simple and Ordinary Kriging ML

Let’s krig using all of the data (cloud semivariogram)

  • First fit the semivariogram with likfit
  • We also examine the profile likelihood, which provides estimates within a window of initial parameter guesses
  • Set up krige.control and then krige.conv for ML or REML
mlfit_gau=likfit(lpm_geo_alb,ini.cov.pars=c(0.5,100),nugget=0.05, fix.nugget=FALSE,cov.model='gaussian', lik.method='ML')
## kappa not used for the gaussian correlation function
## ---------------------------------------------------------------
## likfit: likelihood maximisation using the function optim.
## likfit: Use control() to pass additional
##          arguments for the maximisation function.
##         For further details see documentation for optim.
## likfit: It is highly advisable to run this function several
##         times with different initial values for the parameters.
## likfit: WARNING: This step can be time demanding!
## ---------------------------------------------------------------
## likfit: end of numerical maximisation.
summary(mlfit_gau)
## Summary of the parameter estimation
## -----------------------------------
## Estimation method: maximum likelihood 
## 
## Parameters of the mean component (trend):
##   beta 
## 2.9326 
## 
## Parameters of the spatial component:
##    correlation function: gaussian
##       (estimated) variance parameter sigmasq (partial sill) =  0.6781
##       (estimated) cor. fct. parameter phi (range parameter)  =  129.5
##    anisotropy parameters:
##       (fixed) anisotropy angle = 0  ( 0 degrees )
##       (fixed) anisotropy ratio = 1
## 
## Parameter of the error component:
##       (estimated) nugget =  0.1024
## 
## Transformation parameter:
##       (fixed) Box-Cox parameter = 1 (no transformation)
## 
## Practical Range with cor=0.05 for asymptotic range: 224.2018
## 
## Maximised Likelihood:
##    log.L n.params      AIC      BIC 
## "-49.86"      "4"  "107.7"  "116.6" 
## 
## non spatial model:
##    log.L n.params      AIC      BIC 
## "-87.11"      "2"  "178.2"  "182.7" 
## 
## Call:
## likfit(geodata = lpm_geo_alb, ini.cov.pars = c(0.5, 100), fix.nugget = FALSE, 
##     nugget = 0.05, cov.model = "gaussian", lik.method = "ML")
# uni.only=FALSE means it will calculate 2-D profiles, which is not necessary, remove or set to true for faster computation
prof <- proflik(mlfit_gau, geodata = lpm_geo_alb, sill.values = seq(0.4,0.8,l=5), range.values = seq(50, 150, l=5), nugget.values=seq(0.05,0.2,l=5), uni.only = FALSE) 
## proflik: computing profile likelihood for the sill
## proflik: computing profile likelihood for the range
## proflik: computing profile likelihood for the nugget
## proflik: computing 2-D profile likelihood for the sill and range parameters
## proflik: computing 2-D profile likelihood for the sill and nugget
## proflik: computing 2-D profile likelihood for the range and nugget
par(mfrow=c(2,3))
plot(prof)

# parameters with highest likelihood
prof$range$est.range[1]
## [1] 129.5351
prof$sill$est.sill[1]
## [1] 0.678105
prof$nugget$est.nugget[1]
## [1] 0.1024372
# Kriging by Maximum likelihood
# Simple kriging (constant known mean must be specified)
KCsimple_ml<-krige.control(type.krige="sk",obj.m=mlfit_gau,beta=2)
# locations= are the unobserved locations where we want to make predictions
simple_krig_ml<-krige.conv(lpm_geo_alb,locations=myGrid,krige=KCsimple_ml)
## krige.conv: model with constant mean
## krige.conv: Kriging performed using global neighbourhood
par(mfrow=c(1,4))
#plot on grid
image.plot(xs,ys,matrix(simple_krig_ml$predict,res,res,byrow=FALSE),col=tim.colors(32), main="Simple Kriging")
#to save kriging predicted vals
myGrid2<-data.frame(myGrid,simple_krig$predict, simple_krig$krige.var)

# Ordinary kriging (constant unknown mean)
KCord_ml<-krige.control(type.krige='ok',obj.m=mlfit_gau)
ordinary_krige_ml<-krige.conv(lpm_geo_alb,locations=myGrid,krige=KCord_ml)
## krige.conv: model with constant mean
## krige.conv: Kriging performed using global neighbourhood
# plot gridded predictions
image.plot(xs,ys,matrix(ordinary_krige_ml$predict,res,res,byrow=FALSE),col=tim.colors(32), main="Ordinary Kriging by ML")
# plot standard errors
image.plot(xs,ys,matrix(sqrt(ordinary_krige_ml$krige.var),res,res,byrow=FALSE),col=tim.colors(32), main="Ordinary Kriging ML Errors")

# doing it in one step using the best profile likelihood estimates

krige_ml2 <- krige.conv(lpm_geo_alb, loc=myGrid, 
                 krige=krige.control(type.krige="ok",cov.pars=c(0.678105, 129.5351), nugget=0.1024372))
## krige.conv: model with constant mean
## krige.conv: Kriging performed using global neighbourhood
image.plot(xs,ys,matrix(krige_ml2$predict,res,res,byrow=FALSE),col=tim.colors(32), main="Ordinary Kriging by ML")

Universal Kriging ML

  • We can assess separately whether there is a trend to decide whether we should krig with trend
  • To add a trend we still specify type.krige=‘OK’ but specify trend.d and trend.l components
  • trend.d specifies the trend in the observed locations
  • trend.l specifies the trend at the prediction locations (must be the sae as trend.d)
  • We can also define the mean function by some covariates
# check trends first
summary(lm(logpm25~x+y,data=pm_ca))
## 
## Call:
## lm(formula = logpm25 ~ x + y, data = pm_ca)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1269 -0.3841  0.0983  0.4314  1.7051 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.1276876  0.1341716  23.311  < 2e-16 ***
## x           -0.0034878  0.0010304  -3.385  0.00121 ** 
## y           -0.0006420  0.0007108  -0.903  0.36973    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7216 on 65 degrees of freedom
## Multiple R-squared:  0.3441, Adjusted R-squared:  0.324 
## F-statistic: 17.05 on 2 and 65 DF,  p-value: 1.113e-06
summary(lm(logpm25~x+I(x^2)+y+I(y^2)+ I(x*y),data=pm_ca))
## 
## Call:
## lm(formula = logpm25 ~ x + I(x^2) + y + I(y^2) + I(x * y), data = pm_ca)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.65265 -0.34489 -0.04261  0.35548  1.32363 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.507e+00  1.361e-01  25.772  < 2e-16 ***
## x           -7.441e-03  1.370e-03  -5.433 9.86e-07 ***
## I(x^2)      -2.673e-05  6.697e-06  -3.992 0.000176 ***
## y           -4.843e-03  9.773e-04  -4.956 5.88e-06 ***
## I(y^2)      -1.815e-05  3.215e-06  -5.644 4.40e-07 ***
## I(x * y)    -3.775e-05  8.505e-06  -4.438 3.80e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5736 on 62 degrees of freedom
## Multiple R-squared:  0.6047, Adjusted R-squared:  0.5728 
## F-statistic: 18.97 on 5 and 62 DF,  p-value: 2.14e-11
mlfit_trend_gau=likfit(lpm_geo_alb,ini.cov.pars=c(0.5,100),nugget=0.05, fix.nugget=FALSE, cov.model='gaussian', lik.method='ML',trend= '1st' )
## kappa not used for the gaussian correlation function
## ---------------------------------------------------------------
## likfit: likelihood maximisation using the function optim.
## likfit: Use control() to pass additional
##          arguments for the maximisation function.
##         For further details see documentation for optim.
## likfit: It is highly advisable to run this function several
##         times with different initial values for the parameters.
## likfit: WARNING: This step can be time demanding!
## ---------------------------------------------------------------
## likfit: end of numerical maximisation.
summary(mlfit_trend_gau)
## Summary of the parameter estimation
## -----------------------------------
## Estimation method: maximum likelihood 
## 
## Parameters of the mean component (trend):
##   beta0   beta1   beta2 
##  2.8429 -0.0041 -0.0015 
## 
## Parameters of the spatial component:
##    correlation function: gaussian
##       (estimated) variance parameter sigmasq (partial sill) =  0.4671
##       (estimated) cor. fct. parameter phi (range parameter)  =  75.57
##    anisotropy parameters:
##       (fixed) anisotropy angle = 0  ( 0 degrees )
##       (fixed) anisotropy ratio = 1
## 
## Parameter of the error component:
##       (estimated) nugget =  0.0654
## 
## Transformation parameter:
##       (fixed) Box-Cox parameter = 1 (no transformation)
## 
## Practical Range with cor=0.05 for asymptotic range: 130.8012
## 
## Maximised Likelihood:
##    log.L n.params      AIC      BIC 
##  "-46.3"      "6"  "104.6"  "117.9" 
## 
## non spatial model:
##    log.L n.params      AIC      BIC 
## "-72.77"      "4"  "153.5"  "162.4" 
## 
## Call:
## likfit(geodata = lpm_geo_alb, trend = "1st", ini.cov.pars = c(0.5, 
##     100), fix.nugget = FALSE, nugget = 0.05, cov.model = "gaussian", 
##     lik.method = "ML")
# suppose you only want to add trend in one direction (x or y). Here x is significant in linear trend check.
mlfit_trendx_gau=likfit(lpm_geo_alb,ini.cov.pars=c(0.5,100),nugget=0.05, fix.nugget=FALSE, cov.model='gaussian', lik.method='ML',trend= ~lpm_geo_alb$coords[,1] )
## kappa not used for the gaussian correlation function
## ---------------------------------------------------------------
## likfit: likelihood maximisation using the function optim.
## likfit: Use control() to pass additional
##          arguments for the maximisation function.
##         For further details see documentation for optim.
## likfit: It is highly advisable to run this function several
##         times with different initial values for the parameters.
## likfit: WARNING: This step can be time demanding!
## ---------------------------------------------------------------
## likfit: end of numerical maximisation.
# kriging with linear trend
kriged_grid_trend=krige.conv(lpm_geo_alb,locations=myGrid,krige=krige.control(obj.model=mlfit_trend_gau,trend.d='1st',trend.l='1st'))
## krige.conv: model with mean given by a 1st order polynomial on the coordinates
## krige.conv: Kriging performed using global neighbourhood
# kriging with quadratic trend

mlfit_trend2_gau=likfit(lpm_geo_alb,ini.cov.pars=c(0.5,100),nugget=0.05, fix.nugget=FALSE, cov.model='gaussian', lik.method='ML',trend= '2nd' )
## kappa not used for the gaussian correlation function
## ---------------------------------------------------------------
## likfit: likelihood maximisation using the function optim.
## likfit: Use control() to pass additional
##          arguments for the maximisation function.
##         For further details see documentation for optim.
## likfit: It is highly advisable to run this function several
##         times with different initial values for the parameters.
## likfit: WARNING: This step can be time demanding!
## ---------------------------------------------------------------
## likfit: end of numerical maximisation.
# kriging with quadratic trend (seems to be a better fit)
kriged_grid_trend2=krige.conv(lpm_geo_alb,locations=myGrid,krige=krige.control(obj.model=mlfit_trend2_gau,trend.d='2nd',trend.l='2nd'))
## krige.conv: model with mean given by a 2nd order polynomial on the coordinates
## krige.conv: Kriging performed using global neighbourhood
# have problem with negative predictions, set them to 0
kriged_grid_trend2$predict2<-ifelse(kriged_grid_trend2$predict<0,0,kriged_grid_trend2$predict)
# plot gridded predictions
image.plot(xs,ys,matrix(kriged_grid_trend2$predict2,res,res,byrow=FALSE),col=tim.colors(32), main="Universal Kriging by ML Quad Trend")

Map of Kriging Predictions

Displaying the kriging predictions on a leaflet map requires:

  • transformation of the coordinates back to lat and lon
  • back transforming the predictions if necessary (in this case exponentiate since we log transformed)
proj_albers<-"+proj=aea +lat_1=34.0 +lat_2=40.5 +lon_0=-120.0 +x_0=0 +y_0=-4000000 +ellps=GRS80 +datum=NAD83 +units=km"
myGrid_ll<- project(myGrid, proj_albers, inverse=TRUE)


preds = data.frame(myGrid_ll, pred.pm=exp(kriged_grid_trend2$predict2) %>% as.vector)
pred.pal = colorNumeric(c('darkgreen','gold1','brown'),
                             domain=preds$pred.pm)

  leaflet(preds) %>% 
  addProviderTiles("CartoDB.Positron") %>% 
  addCircles(lng=~x, lat=~y, color=~pred.pal(pred.pm),
             opacity=.7, fillOpacity=.7, radius=1e3) %>% # points w/ 1KM radius
  addLegend('bottomleft', pal=pred.pal, values=preds$pred.pm,
            title="Pred. PM2.5", opacity=.5)

Kriging with Covariate

We can conduct universal kriging with covariates (sometimes called spatial regression), and it may include trend plus covariates. To illustrate we first simulate an elevation variable.

  • The key thing to remember is that if you krig with covariates, the covariate values must be available at the prediction locations!
# Create of random elevation variable to use as a spatially varying covariate in universal kriging
pm_ca$elev<-runif(68,0,2000)

# need to add covariate to geodata obj
lpmelev_geo_alb<-as.geodata(pm_ca,coords.col=c(7,8), data.col=9, covar.col=10, covar.names="elev")
plot(lpmelev_geo_alb)

# check association
mod_elev<-lm(logpm25~elev,dat=pm_ca)
summary(mod_elev)
## 
## Call:
## lm(formula = logpm25 ~ elev, data = pm_ca)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.46496 -0.67533 -0.06067  0.38534  2.20571 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.015e+00  2.126e-01  14.181   <2e-16 ***
## elev        -4.428e-05  1.875e-04  -0.236    0.814    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8839 on 66 degrees of freedom
## Multiple R-squared:  0.0008441,  Adjusted R-squared:  -0.01429 
## F-statistic: 0.05576 on 1 and 66 DF,  p-value: 0.8141
mlfit_elev_gau=likfit(lpmelev_geo_alb,ini.cov.pars=c(0.5,100),nugget=0.05, fix.nugget=FALSE, cov.model='gaussian', lik.method='ML',trend= ~elev)
## kappa not used for the gaussian correlation function
## ---------------------------------------------------------------
## likfit: likelihood maximisation using the function optim.
## likfit: Use control() to pass additional
##          arguments for the maximisation function.
##         For further details see documentation for optim.
## likfit: It is highly advisable to run this function several
##         times with different initial values for the parameters.
## likfit: WARNING: This step can be time demanding!
## ---------------------------------------------------------------
## likfit: end of numerical maximisation.
summary(mlfit_elev_gau)
## Summary of the parameter estimation
## -----------------------------------
## Estimation method: maximum likelihood 
## 
## Parameters of the mean component (trend):
##   beta0   beta1 
##  3.0087 -0.0001 
## 
## Parameters of the spatial component:
##    correlation function: gaussian
##       (estimated) variance parameter sigmasq (partial sill) =  0.6924
##       (estimated) cor. fct. parameter phi (range parameter)  =  130.9
##    anisotropy parameters:
##       (fixed) anisotropy angle = 0  ( 0 degrees )
##       (fixed) anisotropy ratio = 1
## 
## Parameter of the error component:
##       (estimated) nugget =  0.1005
## 
## Transformation parameter:
##       (fixed) Box-Cox parameter = 1 (no transformation)
## 
## Practical Range with cor=0.05 for asymptotic range: 226.6275
## 
## Maximised Likelihood:
##    log.L n.params      AIC      BIC 
## "-49.47"      "5"  "108.9"    "120" 
## 
## non spatial model:
##    log.L n.params      AIC      BIC 
## "-87.08"      "3"  "180.2"  "186.8" 
## 
## Call:
## likfit(geodata = lpmelev_geo_alb, trend = ~elev, ini.cov.pars = c(0.5, 
##     100), fix.nugget = FALSE, nugget = 0.05, cov.model = "gaussian", 
##     lik.method = "ML")
KCelev<-krige.control(obj.m=mlfit_elev_gau,trend.d= ~elev,trend.l=~elev)
#universal_krige2<-krige.conv(lpmelev_geo_alb,locations=myGrid,krige=KCelev)
# Note we cannot do this because we don't have trend values (elevation) at all of the prediction points.

# universal krigging with trend and covariate predict at point locations
mlfit_elev2_gau=likfit(lpmelev_geo_alb,ini.cov.pars=c(0.5,100),nugget=0.05, fix.nugget=FALSE, cov.model='gaussian', lik.method='ML',trend= trend.spatial("2nd", lpmelev_geo_alb, add=~elev))
## kappa not used for the gaussian correlation function
## ---------------------------------------------------------------
## likfit: likelihood maximisation using the function optim.
## likfit: Use control() to pass additional
##          arguments for the maximisation function.
##         For further details see documentation for optim.
## likfit: It is highly advisable to run this function several
##         times with different initial values for the parameters.
## likfit: WARNING: This step can be time demanding!
## ---------------------------------------------------------------
## likfit: end of numerical maximisation.
summary(mlfit_elev2_gau)
## Summary of the parameter estimation
## -----------------------------------
## Estimation method: maximum likelihood 
## 
## Parameters of the mean component (trend):
##   beta0   beta1   beta2   beta3   beta4   beta5   beta6 
##  3.5150 -0.0077 -0.0049  0.0000  0.0000  0.0000 -0.0001 
## 
## Parameters of the spatial component:
##    correlation function: gaussian
##       (estimated) variance parameter sigmasq (partial sill) =  0.2761
##       (estimated) cor. fct. parameter phi (range parameter)  =  36.88
##    anisotropy parameters:
##       (fixed) anisotropy angle = 0  ( 0 degrees )
##       (fixed) anisotropy ratio = 1
## 
## Parameter of the error component:
##       (estimated) nugget =  0.0295
## 
## Transformation parameter:
##       (fixed) Box-Cox parameter = 1 (no transformation)
## 
## Practical Range with cor=0.05 for asymptotic range: 63.82743
## 
## Maximised Likelihood:
##    log.L n.params      AIC      BIC 
## "-38.42"     "10"  "96.84"    "119" 
## 
## non spatial model:
##    log.L n.params      AIC      BIC 
## "-55.02"      "8"    "126"  "143.8" 
## 
## Call:
## likfit(geodata = lpmelev_geo_alb, trend = trend.spatial("2nd", 
##     lpmelev_geo_alb, add = ~elev), ini.cov.pars = c(0.5, 100), 
##     fix.nugget = FALSE, nugget = 0.05, cov.model = "gaussian", 
##     lik.method = "ML")

Cross validation

This is the setup for creating a 70-30 traning testing split:

  • build kriging model on train, prediction locations are test
  • compare the observed and predictions (homework 2)
  • also look into geoR function xvalid
# Split data into test and training sets (70-30)

set.seed(123)
s<-sample(1:nrow(pm_ca),dim(pm_ca)[1]*.7,replace=FALSE)
temp<-1:length(pm_ca$logpm25)
temp[s]<-0
testset<-temp[temp!=0]
test<-pm_ca[testset,]
train<-pm_ca[s,]

IDW Interpolation

Interpolation with inverse distance weighting (IDW) is a simple non-statistical interpolation (weighting) method.

  • In practice one should tune the weight parameter through some type of validation.
  • The function below requires that you create distances between the observed points and between the observed points and the grid to conduct the interpolation
# look at distances with two functions dist and rdist. rdist is in fields library
# distances within PM locations 
# use projected coordinates here

pm_dists<-dist(newcoords)
# distances between PM locations and grid
pm_new_dists<-rdist(newcoords,myGrid)

#inverse distance weighting function

idw<-function(data,locs,newlocs,p){
  dists<-rdist(newlocs,locs)
  return(((dists^(-p))%*%data)/((dists^(-p))%*%rep(1,length(data)))) 
}

# testing different p's
idwPred2<-idw(pm_ca$logpm25,cbind(pm_ca$x,pm_ca$y),myGrid,p=1)
idwPred6<-idw(pm_ca$logpm25,cbind(pm_ca$x,pm_ca$y),myGrid,p=6)
idwPred12<-idw(pm_ca$logpm25,cbind(pm_ca$x,pm_ca$y),myGrid,p=12)

# plotting the results on our grid
image.plot(xs,ys,matrix(idwPred2,res,res),col=tim.colors(32))
points(pm_ca$x,pm_ca$y,pch=19,cex=0.3)

image.plot(xs,ys,matrix(idwPred6,res,res),col=tim.colors(32))
points(pm_ca$x,pm_ca$y,pch=19,cex=0.3)

image.plot(xs,ys,matrix(idwPred12,res,res),col=tim.colors(32))
points(pm_ca$x,pm_ca$y,pch=19,cex=0.3)

GAM

Generalized additive models (GAM) are powerful non-parametric representations of data (sometimes called functional data). We concentrate on non-parametric functions of our predictor variables.

co2<-read.csv("/Users/mf/Dropbox (University of Southern California)/Courses/PM569/Data/co2_mm_mlo.csv")
# CO2 data for time series illustration of basis functions
plot(co2$date_dec,co2$co2,type='l')

co2_2018<-co2[co2$year==2018,]
co2_1998<-co2[co2$year==1998,]
plot(co2_2018$co2)

# Simple polynomial basis
lm_co2=lm(co2~month+I(month^2)+I(month^3),data=co2_2018)
lm_co2$coefficients
##  (Intercept)        month   I(month^2)   I(month^3) 
## 403.12161616   4.60834295  -0.84445943   0.04178969
# Fit polynomial result
month<-seq(1:12)
y <- 403.12161616+ 4.60834295*month -0.84445943*month^2 + 0.04178969*month^3

plot(co2_2018$co2, ylab="CO2, ppm",xlab="Month")
lines(y)

# Use polynomial result to predict for different year
poly_pred<-predict(lm_co2,newdat=co2_1998)

plot(co2_1998$co2,ylim=c(360,410))
lines(poly_pred,col='red')

# not a good fit because we didn't take longer term trend into account

# Using cubic regression spline bases with 4 knots
gam_co2<-gam(co2~s(month,bs="cr"),data=co2_2018)
plot(gam_co2)

# try fitting to all data and smoothing date (overall trends) and month (to get within year trends)
gam_co2_all<-gam(co2~s(date_dec,bs="cr",k=20)+s(month,bs="cc"),data=co2)
summary(gam_co2_all)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## co2 ~ s(date_dec, bs = "cr", k = 20) + s(month, bs = "cc")
## 
## Parametric coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 354.49606    0.01675   21158   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                edf Ref.df      F p-value    
## s(date_dec) 18.587  18.98 147276  <2e-16 ***
## s(month)     7.231   8.00   1898  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =      1   Deviance explained =  100%
## GCV = 0.21498  Scale est. = 0.20717   n = 738
# predict on data 
pred_co2<-predict.gam(gam_co2_all,co2)
plot(pred_co2,type='l')

# Thin plate splines (2-D)
# Let gam() find number of knots
gam_pm<-gam(logpm25~s(x,y,bs="ts"),data=pm_ca)
plot(gam_pm, main="GAM default k")

summary(gam_pm)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## logpm25 ~ s(x, y, bs = "ts")
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.97205    0.03935   75.52   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df    F p-value    
## s(x,y) 22.31     29 14.8  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.863   Deviance explained = 90.9%
## GCV = 0.16025  Scale est. = 0.10532   n = 68
# Change number of knots so it is more smooth (larger k)  
gam_pm2<-gam(logpm25~s(x,y,bs="ts",k=60),data=pm_ca)
plot(gam_pm2, main="GAM large k")

summary(gam_pm2)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## logpm25 ~ s(x, y, bs = "ts", k = 60)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.97205    0.01101     270   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df     F p-value    
## s(x,y) 57.25     59 105.6  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.989   Deviance explained = 99.8%
## GCV = 0.057487  Scale est. = 0.0082417  n = 68
# Use fitted gam object to predict smooth in x,y at grid locations

pred_gam<-predict.gam(gam_pm2,myGrid,se.fit=TRUE)

# Plot predictions
image.plot(xs,ys,matrix(pred_gam$fit,res,res),col=tim.colors(32))
points(pm_ca$x,pm_ca$y,pch=19,cex=0.3)

# Standard errors of predictions
image.plot(xs,ys,matrix(pred_gam$se.fit,res,res),col=tim.colors(32))

# Can also add linear and/or smooth covariates
gam_pm3<-gam(logpm25~s(elev)+s(x,y,bs="ts",k=50),data=pm_ca)
summary(gam_pm3)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## logpm25 ~ s(elev) + s(x, y, bs = "ts", k = 50)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.97205    0.02718   109.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##           edf Ref.df      F p-value    
## s(elev)  1.00      1  1.676   0.206    
## s(x,y)  39.22     49 19.937  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.935   Deviance explained = 97.4%
## GCV = 0.12752  Scale est. = 0.050225  n = 68
#pred_gam_new3<-predict.gam(gam_pm3,myGrid,se.fit=TRUE)
#image.plot(xs,ys,matrix(pred_gam3$fit,res,res),col=tim.colors(32))
# Note we also have the issue here where we can't predict where don't have elevation.


# make nice map of gam predictions

preds = data.frame(myGrid_ll, pred.pm=exp(pred_gam$fit) %>% as.vector)
pred.pal = colorNumeric(c('darkgreen','gold1','brown'),
                             domain=preds$pred.pm)

  leaflet(preds) %>% 
  addProviderTiles("CartoDB.Positron") %>% 
  addCircles(lng=~x, lat=~y, color=~pred.pal(pred.pm),
             opacity=.7, fillOpacity=.7, radius=1e3) %>% # points w/ 1KM radius
  addLegend('bottomleft', pal=pred.pal, values=preds$pred.pm,
            title="GAM Pred. PM2.5", opacity=.5)